/*******************************************************************************
 Copyright (c) 2021-2023 Arm Corporation All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

   * Redistributions of source code must retain the above copyright notice,
     this list of conditions and the following disclaimer.
   * Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.
   * Neither the name of Arm Corporation nor the names of its contributors
     may be used to endorse or promote products derived from this software
     without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#ifndef _ZUC_SOBX_INC_
#define _ZUC_SOBX_INC_

#include "aarch64/aesni_emu_aarch64.inc"

/*
 * Lookup tables for the ZUC S0/S1 S-box macros below.
 *
 * Every table is exactly 16 bytes so a single `tbl` instruction can index
 * it with one nibble (0..15) per byte lane.  The tables are never written
 * (only loaded with `ld1`), so they belong in .rodata; the original .data
 * placement made them needlessly writable (one private copy per process
 * when mapped from a shared object).
 * NOTE: on AArch64, `.align 4` is a power-of-two request, i.e. 16 bytes.
 */
.section .rodata
.align 4
// P1/P2/P3: 4-bit permutation tables used by S0_compute_NEON
// (the ZUC S0 box is built from three nibble permutations).
.type	P1, %object
P1:
	.byte 0x09, 0x0F, 0x00, 0x0E, 0x0F, 0x0F, 0x02, 0x0A, 0x00, 0x04, 0x00, 0x0C, 0x07, 0x05, 0x03, 0x09
.size	P1,.-P1

.align 4
.type	P2, %object
P2:
	.byte 0x08, 0x0D, 0x06, 0x05, 0x07, 0x00, 0x0C, 0x04, 0x0B, 0x01, 0x0E, 0x0A, 0x0F, 0x03, 0x09, 0x02
.size	P2,.-P2

.align 4
.type	P3, %object
P3:
	.byte 0x02, 0x06, 0x0A, 0x06, 0x00, 0x0D, 0x0A, 0x0F, 0x03, 0x03, 0x0D, 0x05, 0x00, 0x09, 0x0C, 0x0D
.size	P3,.-P3

// Input-side transform for S1 (low/high nibble halves, combined with
// XOR by MUL_TBL_NEON).  NOTE(review): values ported from the x86
// implementation — presumably a GF(2) matrix mapping the ZUC S1 input
// domain onto the AES S-box input domain; confirm against the reference.
.align 4
.type Aes_to_Zuc_mul_low_nibble, %object
Aes_to_Zuc_mul_low_nibble:
	.byte 0x00, 0x01, 0x82, 0x83, 0x9e, 0x9f, 0x1c, 0x1d, 0x24, 0x25, 0xa6, 0xa7, 0xba, 0xbb, 0x38, 0x39
.size	Aes_to_Zuc_mul_low_nibble,.-Aes_to_Zuc_mul_low_nibble

.align 4
.type Aes_to_Zuc_mul_high_nibble, %object
Aes_to_Zuc_mul_high_nibble:
	.byte 0x00, 0xd5, 0x08, 0xdd, 0x7c, 0xa9, 0x74, 0xa1, 0x9c, 0x49, 0x94, 0x41, 0xe0, 0x35, 0xe8, 0x3d
.size	Aes_to_Zuc_mul_high_nibble,.-Aes_to_Zuc_mul_high_nibble

// Byte permutation applied before the AES step in S1_compute_NEON*.
// NOTE(review): pattern looks like an inverse-ShiftRows ordering,
// presumably pre-compensating the ShiftRows inside aese — confirm.
.align 4
.type Shuf_mask, %object
Shuf_mask:
	.byte 0x00, 0x0D, 0x0A, 0x07, 0x04, 0x01, 0x0e, 0x0b, 0x08, 0x05, 0x02, 0x0f, 0x0C, 0x09, 0x06, 0x03
.size	Shuf_mask,.-Shuf_mask

// Output-side transform for S1 (low/high nibble halves), applied after
// the AES step; the S1 macros finish by XORing the result with 0x55.
.align 4
.type Comb_matrix_mul_low_nibble, %object
Comb_matrix_mul_low_nibble:
	.byte 0x55, 0x41, 0xff, 0xeb, 0x24, 0x30, 0x8e, 0x9a, 0xe2, 0xf6, 0x48, 0x5c, 0x93, 0x87, 0x39, 0x2d
.size Comb_matrix_mul_low_nibble,.-Comb_matrix_mul_low_nibble

.align 4
.type Comb_matrix_mul_high_nibble, %object
Comb_matrix_mul_high_nibble:
	.byte 0x55, 0xba, 0xcc, 0x23, 0x15, 0xfa, 0x8c, 0x63, 0x09, 0xe6, 0x90, 0x7f, 0x49, 0xa6, 0xd0, 0x3f
.size Comb_matrix_mul_high_nibble,.-Comb_matrix_mul_high_nibble

// Scratch pointer register used by the macros below to address the
// lookup tables (adrp/add pairs).
// NOTE(review): x23 is callee-saved under AAPCS64 — callers of these
// macros are assumed to save/restore x23; confirm at the call sites.
#define xPage 	x23

/*
 * Per-byte double table lookup: split each input byte into nibbles, look
 * each nibble up in its own 16-entry table, XOR the two partial results.
 * (Used here to apply the *_mul_low/high_nibble transform tables above.)
 *
 * vDATA       - input bytes; clobbered (holds the high nibbles on exit)
 * vTBL_LO     - table indexed by low nibble; clobbered by its lookup
 * vTBL_HI_OUT - table indexed by high nibble on entry; result on exit
 * vSCRATCH    - scratch; clobbered
 */
.macro MUL_TBL_NEON vDATA, vTBL_LO, vTBL_HI_OUT, vSCRATCH
	// Split every byte into its two nibbles.
	movi	\vSCRATCH\().16b, 0x0F
	and	\vSCRATCH\().16b, \vDATA\().16b, \vSCRATCH\().16b   // low nibbles
	ushr	\vDATA\().16b, \vDATA\().16b, #4                    // high nibbles

	// Independent per-nibble lookups, combined with XOR.
	tbl	\vTBL_HI_OUT\().16b, {\vTBL_HI_OUT\().16b}, \vDATA\().16b
	tbl	\vTBL_LO\().16b, {\vTBL_LO\().16b}, \vSCRATCH\().16b
	eor	\vTBL_HI_OUT\().16b, \vTBL_HI_OUT\().16b, \vTBL_LO\().16b
.endm
/*
 * Compute 16 S0 box values from 16 bytes, stored in SIMD register
 *
 * Per-byte algorithm (three nibble permutations P1/P2/P3 from .rodata):
 *   x1 = x >> 4            high nibble
 *   x2 = x & 0xF           low nibble
 *   q  = x1 ^ P1[x2]
 *   r  = x2 ^ P2[q]
 *   s  = q  ^ P3[r]
 *   S0 = rotl8((s << 4) | r, 5)
 *
 * IN_OUT       - in: 16 input bytes; out: 16 S0 values
 * vTMP1, vTMP2 - scratch; clobbered
 * Also clobbers xPage (x23) for the table addresses.
 */
.macro	S0_compute_NEON	IN_OUT, vTMP1, vTMP2
	movi	\vTMP2\().16b, 0x0F
	ushr	\vTMP1\().16b, \IN_OUT\().16b, #4                   // x1
	and	\IN_OUT\().16b, \IN_OUT\().16b, \vTMP2\().16b           // x2

	adrp	xPage, P1
	add	xPage, xPage, #:lo12:P1
	ld1	{\vTMP2\().16b}, [xPage]
	tbl	\vTMP2\().16b, {\vTMP2\().16b}, \IN_OUT\().16b          // P1[x2]
	eor	\vTMP2\().16b, \vTMP2\().16b, \vTMP1\().16B             // q = x1 ^ P1[x2]

	adrp	xPage, P2
	add	xPage, xPage, #:lo12:P2
	ld1	{\vTMP1\().16b}, [xPage]
	tbl	\vTMP1\().16b, {\vTMP1\().16b}, \vTMP2\().16b           // P2[q]
	eor	\vTMP1\().16b, \vTMP1\().16b, \IN_OUT\().16B            // r = x2 ^ P2[q]

	adrp	xPage, P3
	add	xPage, xPage, #:lo12:P3
	ld1	{\IN_OUT\().16b}, [xPage]
	tbl	\IN_OUT\().16b, {\IN_OUT\().16b}, \vTMP1\().16b         // P3[r]
	eor	\IN_OUT\().16b, \IN_OUT\().16b, \vTMP2\().16B           // s = q ^ P3[r]

	// s << 4 (since high nibble of each byte is 0, no masking is required:
	// the 64-bit lane shift cannot leak bits between bytes because every
	// byte's top nibble is zero before the shift)
	shl	\IN_OUT\().2d, \IN_OUT\().2d, #4
	orr	\vTMP1\().16b, \IN_OUT\().16b, \vTMP1\().16b           // t = (s << 4) | r

	// Rotate left 5 bits in each byte, within a SIMD register:
	// (t >> 3) with the low 5 bits overwritten by (t << 5) via sli
	ushr	\IN_OUT\().16b, \vTMP1\().16b, #3
	sli	\IN_OUT\().16b, \vTMP1\().16b, #5
.endm


#ifndef INTEL_AESNCLAST
/*
 * Emulate x86 AESENCLAST(state, key) = ShiftRows(SubBytes(state)) ^ key
 * with the AArch64 Crypto Extension.  AESE performs AddRoundKey followed
 * by SubBytes+ShiftRows, so running it with an all-zero round key leaves
 * exactly ShiftRows(SubBytes(state)); a final XOR adds the real key.
 *
 * vstate - AES state in/out
 * vrk    - round key, XORed into the result; preserved
 * vzero  - scratch; cleared to zero
 */
.macro AESNCLAST_AS_ARM vstate, vrk, vzero
	movi	\vzero\().16b, #0                           // zero round key
	aese	\vstate\().16b, \vzero\().16b               // SubBytes + ShiftRows
	eor	\vstate\().16b, \vstate\().16b, \vrk\().16b
.endm
#define INTEL_AESNCLAST AESNCLAST_AS_ARM
#endif

/*
 * Compute 16 S1 box values from 16 bytes, stored in SIMD register,
 * using the AArch64 Crypto Extension (via INTEL_AESNCLAST).
 *
 * Pipeline (per byte):
 *   1. input transform via the Aes_to_Zuc_mul_* nibble tables
 *   2. byte permutation by Shuf_mask (presumably pre-compensating the
 *      ShiftRows inside the AES step — NOTE(review): confirm)
 *   3. AESENCLAST with round key 0x63 (the AES S-box affine constant)
 *   4. output transform via the Comb_matrix_mul_* nibble tables
 *   5. final XOR with 0x55
 *
 * vIN_OUT             - in: 16 input bytes; out: 16 S1 values
 * vTMP1, vTMP2, vTMP3 - scratch; clobbered
 * Also clobbers xPage (x23) for the table addresses.
 */
.macro	S1_compute_NEON	vIN_OUT, vTMP1, vTMP2, vTMP3
	adrp	xPage, Aes_to_Zuc_mul_low_nibble
	add	xPage, xPage, #:lo12:Aes_to_Zuc_mul_low_nibble
	ld1	{\vTMP1\().16b}, [xPage]

	adrp	xPage, Aes_to_Zuc_mul_high_nibble
	add	xPage, xPage, #:lo12:Aes_to_Zuc_mul_high_nibble
	ld1	{\vTMP2\().16b}, [xPage]

	// Input transform; result lands in vTMP2 (vIN_OUT now holds nibbles).
	MUL_TBL_NEON \vIN_OUT, \vTMP1, \vTMP2, \vTMP3

	adrp	xPage, Shuf_mask
	add	xPage, xPage, #:lo12:Shuf_mask
	ld1	{\vTMP1\().16b}, [xPage]
	// Permute the transformed bytes by Shuf_mask.
	tbl	\vTMP1\().16b, {\vTMP2\().16b}, \vTMP1\().16b

	// 0x63 round key for the AESENCLAST step below.
	movi	\vTMP2\().16b, 0x63

	INTEL_AESNCLAST \vTMP1, \vTMP2, \vTMP3

	adrp	xPage, Comb_matrix_mul_low_nibble
	add	xPage, xPage, #:lo12:Comb_matrix_mul_low_nibble
	ld1	{\vTMP2\().16b}, [xPage]

	adrp	xPage, Comb_matrix_mul_high_nibble
	add	xPage, xPage, #:lo12:Comb_matrix_mul_high_nibble
	ld1	{\vIN_OUT\().16b}, [xPage]

	// Output transform; result lands in vIN_OUT.
	MUL_TBL_NEON \vTMP1, \vTMP2, \vIN_OUT, \vTMP3

	// Final constant addition: S1 = result ^ 0x55.
	movi	\vTMP3\().16b, 0x55
	eor	\vIN_OUT\().16b, \vIN_OUT\().16b, \vTMP3\().16b
.endm

/*
 * Compute 16 S1 box values from 16 bytes, stored in SIMD register,
 * WITHOUT the Crypto Extension: identical pipeline to S1_compute_NEON,
 * but the AESENCLAST step is performed by EMULATE_AESENCLAST from
 * aarch64/aesni_emu_aarch64.inc (included at the top of this file).
 *
 * vIN_OUT             - in: 16 input bytes; out: 16 S1 values
 * vTMP1, vTMP2, vTMP3 - scratch; clobbered
 * Also clobbers xPage (x23); EMULATE_AESENCLAST may clobber more —
 * NOTE(review): check its register usage in aesni_emu_aarch64.inc.
 */
.macro	S1_compute_NEON_NO_AESNI	vIN_OUT, vTMP1, vTMP2, vTMP3
	adrp	xPage, Aes_to_Zuc_mul_low_nibble
	add	xPage, xPage, #:lo12:Aes_to_Zuc_mul_low_nibble
	ld1	{\vTMP1\().16b}, [xPage]

	adrp	xPage, Aes_to_Zuc_mul_high_nibble
	add	xPage, xPage, #:lo12:Aes_to_Zuc_mul_high_nibble
	ld1	{\vTMP2\().16b}, [xPage]

	// Input transform; result lands in vTMP2 (vIN_OUT now holds nibbles).
	MUL_TBL_NEON \vIN_OUT, \vTMP1, \vTMP2, \vTMP3

	adrp	xPage, Shuf_mask
	add	xPage, xPage, #:lo12:Shuf_mask
	ld1	{\vTMP1\().16b}, [xPage]
	// Permute the transformed bytes by Shuf_mask.
	tbl	\vTMP1\().16b, {\vTMP2\().16b}, \vTMP1\().16b

	// 0x63 round key for the emulated AESENCLAST step below.
	movi	\vTMP2\().16b, 0x63


	EMULATE_AESENCLAST \vTMP1, \vTMP2, \vTMP3

	adrp	xPage, Comb_matrix_mul_low_nibble
	add	xPage, xPage, #:lo12:Comb_matrix_mul_low_nibble
	ld1	{\vTMP2\().16b}, [xPage]

	adrp	xPage, Comb_matrix_mul_high_nibble
	add	xPage, xPage, #:lo12:Comb_matrix_mul_high_nibble
	ld1	{\vIN_OUT\().16b}, [xPage]

	// Output transform; result lands in vIN_OUT.
	MUL_TBL_NEON \vTMP1, \vTMP2, \vIN_OUT, \vTMP3

	// Final constant addition: S1 = result ^ 0x55.
	movi	\vTMP3\().16b, 0x55
	eor	\vIN_OUT\().16b, \vIN_OUT\().16b, \vTMP3\().16b
.endm
#endif // ifndef _ZUC_SOBX_INC_
